{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "e64cee95",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import pandas as pd\n",
    "from neo4j import GraphDatabase, basic_auth\n",
    "from dotenv import load_dotenv, find_dotenv\n",
    "import os\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "613ebae1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Neo4j connection settings from ~/.neo4j_config.env.\n",
    "load_dotenv(os.path.join(os.path.expanduser('~'), '.neo4j_config.env'))\n",
    "\n",
    "SPOKE_USER = os.environ.get('SPOKE_USER')\n",
    "SPOKE_PASSWORD = os.environ.get('SPOKE_PSW')\n",
    "URI = os.environ.get('SPOKE_URI')\n",
    "\n",
    "# Fail fast with a readable message instead of handing None to the driver.\n",
    "missing_settings = [\n",
    "    name\n",
    "    for name, value in (('SPOKE_USER', SPOKE_USER), ('SPOKE_PSW', SPOKE_PASSWORD), ('SPOKE_URI', URI))\n",
    "    if value is None\n",
    "]\n",
    "if missing_settings:\n",
    "    raise RuntimeError('Missing Neo4j settings: ' + ', '.join(missing_settings))\n",
    "\n",
    "# sdb is the driver handle used by the query cell below.\n",
    "auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)\n",
    "sdb = GraphDatabase.driver(URI, auth=auth)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "8c0fdb36",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): pickle.load can execute arbitrary code; only open pickle files from a trusted source.\n",
    "# disease_names is presumably a list of disease-name strings (verify against the file's producer).\n",
    "with open('../data/disease_with_relation_to_genes.pickle', 'rb') as pickle_file:\n",
    "    disease_names = pickle.load(pickle_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "04534ff5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 176 ms, sys: 23.3 ms, total: 199 ms\n",
      "Wall time: 607 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Look up the identifier of every Disease node whose name is in disease_names.\n",
    "# The names travel as a query parameter rather than being formatted into the\n",
    "# Cypher text, so quotes or backslashes inside a name cannot break the query.\n",
    "query = '''\n",
    "    MATCH(d:Disease)\n",
    "    WHERE d.name IN $disease_names\n",
    "    RETURN d.identifier AS d_id, d.name AS d_name\n",
    "'''\n",
    "with sdb.session() as session:\n",
    "    with session.begin_transaction() as tx:\n",
    "        result = tx.run(query, disease_names=list(disease_names))\n",
    "        # Consume the records while the transaction is still open.\n",
    "        out_list = [(row['d_id'], row['d_name']) for row in result]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "3bf896fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Tabulate the (identifier, name) pairs returned by the database query.\n",
    "disease_df = pd.DataFrame(out_list, columns=['disease_id', 'disease_name'])\n",
    "disease_name_df = pd.DataFrame(disease_names, columns=['disease_name'])\n",
    "\n",
    "# The left join keeps every requested name, including names with no matching\n",
    "# node; exact duplicate rows are then dropped.\n",
    "disease_df = (\n",
    "    disease_name_df\n",
    "    .merge(disease_df, how='left', on='disease_name')\n",
    "    .drop_duplicates()\n",
    ")\n",
    "\n",
    "disease_df.to_csv('../data/disease_name_with_id.csv', index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "a3a2139a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>disease_name</th>\n",
       "      <th>disease_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>serum amyloid A amyloidosis</td>\n",
       "      <td>DOID:0080936</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>erythroleukemia</td>\n",
       "      <td>DOID:0080916</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>spondylometaphyseal dysplasia Sedaghatian type</td>\n",
       "      <td>DOID:0112298</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>cerebrooculofacioskeletal syndrome 2</td>\n",
       "      <td>DOID:0080912</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>bilateral frontoparietal polymicrogyria</td>\n",
       "      <td>DOID:0080922</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6305</th>\n",
       "      <td>graft-versus-host disease</td>\n",
       "      <td>DOID:0081267</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6306</th>\n",
       "      <td>acute myeloid leukemia with maturation</td>\n",
       "      <td>DOID:0081087</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6307</th>\n",
       "      <td>frontonasal dysplasia</td>\n",
       "      <td>DOID:0081044</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6308</th>\n",
       "      <td>central diabetes insipidus</td>\n",
       "      <td>DOID:0081055</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6309</th>\n",
       "      <td>acute myelomonocytic leukemia</td>\n",
       "      <td>DOID:0081082</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6306 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        disease_name    disease_id\n",
       "0                        serum amyloid A amyloidosis  DOID:0080936\n",
       "1                                    erythroleukemia  DOID:0080916\n",
       "2     spondylometaphyseal dysplasia Sedaghatian type  DOID:0112298\n",
       "3               cerebrooculofacioskeletal syndrome 2  DOID:0080912\n",
       "4            bilateral frontoparietal polymicrogyria  DOID:0080922\n",
       "...                                              ...           ...\n",
       "6305                       graft-versus-host disease  DOID:0081267\n",
       "6306          acute myeloid leukemia with maturation  DOID:0081087\n",
       "6307                           frontonasal dysplasia  DOID:0081044\n",
       "6308                      central diabetes insipidus  DOID:0081055\n",
       "6309                   acute myelomonocytic leukemia  DOID:0081082\n",
       "\n",
       "[6306 rows x 2 columns]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the merged disease name/identifier table as the cell output.\n",
    "disease_df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
